Preprocessing Dataset
Libraries
Library for read dataset.
library(readr)Library for data frames processing.
library(dplyr)
library(tidyr)Library for R Markdown.
library(rmarkdown)
library(knitr)Library for data presentation.
library(scales)Library for manage strings.
library(stringr)Load dataset
Set the path of the dataframe file.
path = "../virusTotal/data/virusTotal.csv"Load dataset.
df <- read_csv(path)Statistics
Dimensions.
dim(df)## [1] 183 447
Types
View witch types are in the dataset.
col_types_all <-
df %>%
sapply(typeof) %>%
unlist()
col_types_table <-
col_types_all %>%
table()
col_types <-
col_types_table %>%
as.vector()
names(col_types) <- names(col_types_table)## character double logical
## 158 204 85
As can be seen there are the three expected types: character, double and logical.
NA
Percentaje of NA values
Define function to see the amount of NA values in the dataframe.
percent_of_NA <-
function(df){
num_of_NA <-
df %>% is.na() %>% sum()
num_of_values <-
df %>% dim() %>% prod()
percent_of_NA <-
(num_of_NA / num_of_values) %>%
percent()
return(percent_of_NA)
}percent_of_NA(df)## [1] "37%"
Columns with NA
Define functions to see the NA in columns.
num_of_NA_by_column <-
function(df){
df %>% is.na() %>% colSums()
}remove_0 <-
function(x) x[x!=0]names_of_colums_with_NA <-
function(df)
df %>%
num_of_NA_by_column() %>%
remove_0 %>%
names()percentaje_of_cols_with_NA <-
function(df)
(length(names_of_colums_with_NA(df)) / ncol(df)) %>%
percent()Compute the percentaje of cols with NA.
percentaje_of_cols_with_NA(df)## [1] "74%"
Inspect if there are columns full of NA.
is_full_of_NA <- function(col){
num_of_NA <-
col %>%
is.na() %>%
sum()
return(num_of_NA == length(col))
}cols_full_of_NA <-
df %>%
select_if(is_full_of_NA) %>%
names()## [1] "authentihash" "scans.Bkav.result"
## [3] "scans.CMC.result" "scans.ALYac.result"
## [5] "scans.Malwarebytes.result" "scans.K7AntiVirus.result"
## [7] "scans.Baidu.result" "scans.SUPERAntiSpyware.result"
## [9] "scans.Gridinsoft.result" "scans.ViRobot.result"
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"
## [13] "scans.VBA32.result" "scans.Zoner.result"
## [15] "scans.Panda.result" "scans.Elastic.result"
## [17] "scans.Cylance.result" "scans.SentinelOne.result"
As can be seen there are many columns that are full of NA, so can be deleted.
df <-
select(df, -all_of(cols_full_of_NA))Colums with the same value
Maybe there are columns that has the same value along all the vector, so are useless.
Define function to remove these columns.
different_values <-
function(x)
x %>% na.omit() %>% unique() %>% length()remove_columns_with_the_same_value <-
function(df)
select_if(df, function(col) different_values(col) > 1)Apply function.
num_of_cols_after_remove <-
df %>%
remove_columns_with_the_same_value() %>%
ncol()Calculate the number of columns with same value.
ncol(df) - num_of_cols_after_remove## [1] 147
Awesom! Many colums found. Let’s remove them.
df <-
remove_columns_with_the_same_value(df)Inspecting dataframe
Now let’s deeply inspect into the dataframe.
View dataframe
View dataframe.
Renaming
The column “…1” is the row number, so “n” will be a better name. The “…JSON” it’s a bad name, just “json” is fine.
df <-
df %>%
rename(n = ...1, json=..JSON)Removing cols
There are many duplicated cols, hashes & dates that can be removed, also many useless.
Dates
There are many dates in the dataset, that are not relevant for virus analysis. So let’s remove them.
Define a function for check if a col is of type Date.
not <-
function(x) !x
get_element <-
function(x, index) x[index]
is_date_col <-
function(col, pattern="^[:digit:]{4}[-:/][:digit:]{2}[-:/][:digit:]{2}")
col %>%
as.character() %>%
na.omit() %>%
get_element(1) %>%
str_detect(pattern)Columns detected.
df %>%
select_if(is_date_col) %>%
head() %>%
paged_table()Define function for remove cols by a predicate.
remove_col_if <-
function(df, fun){
cols_to_delete <-
df %>%
select_if(fun) %>%
colnames()
df <-
df %>%
select(-cols_to_delete)
return(df)
}Remove them.
df <-
remove_col_if(df, is_date_col)## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols_to_delete)` instead of `cols_to_delete` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Hashes
There are many hashes cols that don’t really provide useful information. So remove them.
hashes <-
c("")
df <-
df %>%
select(
-vhash,
-sha256,
-sha1,
-scan_id,
-ssdeep,
-md5,
-additional_info.androguard.certificate.serialnumber,
-additional_info.androguard.certificate.thumbprint,
-additional_info.exiftool.ZipCRC
)Scans
There are many scans of different antivirus, that has very similar information. Just keep the scan with less NA values.
Get the best col.
scans_col_names <-
df %>%
colnames() %>%
str_match_all("scans.[:alpha:]*.result") %>%
unlist()
scan_na_by_col <-
df %>%
select(all_of(scans_col_names)) %>%
num_of_NA_by_column()
scan_col_witch_min_na <-
scan_na_by_col %>%
which.min()
best_scanner_colname <-
scan_na_by_col %>%
names() %>%
get_element(scan_col_witch_min_na)
best_scanner_col <-
df %>%
select(all_of(best_scanner_colname))
best_scanner_name <-
best_scanner_colname %>%
str_split("[.]") %>%
unlist() %>%
get_element(2)## [1] "Best scan col: scans.Fortinet.result"
## [1] "Best scanner: Fortinet"
Drop all scans but the best.
col_index_scanners <-
df %>%
colnames() %>%
str_detect("scans") %>%
unlist() %>%
which()
df <-
df %>%
select(-all_of(col_index_scanners)) %>%
cbind(best_scanner_col)Individual columns
Reasons:
- json column contains all the row as JSON.
- permalink is the URL where Virus Total has the virus file.
- Main.Activity & Package are strings with all different values.
- FileTypeExtension, ZipFileName & MIMEType has same values as FileType.
- ZipBitFlag doesn’t seems to matter.
- additional_info.magic has the vesion of the ZIP file, that doesn’t seems to matter.
- Subject.DN is the JSON fragment that has all the information about the subject, but these data are decomposed in the rest of Subject cols.
df <-
df %>%
select(
-json,
-permalink,
-additional_info.androguard.AndroidApplicationInfo,
-additional_info.androguard.Main.Activity,
-additional_info.exiftool.MIMEType,
-additional_info.exiftool.FileTypeExtension,
-additional_info.exiftool.ZipFileName,
-additional_info.magic,
-additional_info.androguard.Package,
-additional_info.androguard.certificate.Subject.DN
)Groups of columns
Define a function to remove cols which name match a pattern.
remove_cols_which_name_match <-
function(df, pattern){
cols_to_remove <-
df %>%
colnames() %>%
str_which(pattern)
df_removed_cols <-
df %>%
select(-all_of(cols_to_remove))
return(df_removed_cols)
}Remove groups.
Reasons:
- Issuer group has the same information as Subject group.
- CompressedView and RiskIndicator.APK groups have the same information as file_type group
df <-
df %>%
remove_cols_which_name_match("^additional_info.androguard.certificate.Issuer.[:alpha:]*$") %>%
remove_cols_which_name_match("^additional_info.compressedview.extensions.[:alpha:]*$") %>%
remove_cols_which_name_match("^additional_info.androguard.RiskIndicator.APK.[:alpha:]*$")View results
Define a function for sort columns.
sort_cols <-
function(df){
df <-
df %>%
select(order(colnames(df)))
additionalInfo_cols_logical <-
df %>%
colnames() %>%
str_detect("additional_info")
additionalInfo_cols <-
df %>%
select(which(additionalInfo_cols_logical))
not_additionalInfo_cols <-
df %>%
select(which(!additionalInfo_cols_logical)) %>%
select(n, size, everything())
return(cbind(not_additionalInfo_cols, additionalInfo_cols))
}Sort columns.
df <-
sort_cols(df)View results.
Replacing values
Replace “Unknown” and “?” by NA
There are some columns that hast the value “Unknown” or “?” instead of NAs. So let’s replace them.
Define a function to replace values in cols that satisfy a predicate.
replace_when <-
function(df, fun, value, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace(col, which(col==value), replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}Replace ? and Unknown for NAs.
df <-
df %>%
replace_when(function(col) any(str_detect(col, fixed("Unknown"))), "Unknown", NA) %>%
replace_when(function(col) any(str_detect(col, fixed("?"))), "?", NA)Replace NA for 0
Permissions colums
The permissions columns (PERM) seems that there are NAs where there should be 0s. So it would be better to replace them.
replace_na_which_colname_match <-
function(df, pattern, replacement){
cols_to_replace <-
df %>%
colnames() %>%
str_which(pattern)
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}replace_na_when <-
function(df, fun, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}pattern <- "additional_info.androguard.RiskIndicator.PERM"
df <-
df %>%
replace_na_which_colname_match(pattern, 0)
df %>%
select(., str_which(colnames(.), pattern)) %>%
paged_table()New total permissions column
pattern <- "additional_info.androguard.RiskIndicator.PERM"
df_without_permissions <-
df %>%
select(., -(str_which(colnames(.), pattern)))
df_permissions <-
df %>%
select(., str_which(colnames(.), pattern)) %>%
mutate(., total_PERMs = rowSums(.))
df <- cbind(df_without_permissions, df_permissions)View results
Sort columns.
df <-
sort_cols(df)View results.
Save dataframe
After all preprocessing let’s save it into CSV.
write.csv(df, path)Functions for preprocessing
As factor
labels <-
function(n){
if(n == 5){
return(c("very low", "low", "medium", "high", "very high"))
}else if(n == 4){
return(c("very low", "low", "high", "very high"))
}else if(n == 3){
return(c("low", "medium", "high"))
}else if(n == 2){
return(c("low", "high"))
}else{
stop("Not avalible")
}
}
cut_by_quantiles <-
function(col){
quantiles <-
col %>%
quantile(na.rm = TRUE) %>%
unique()
if(length(quantiles) > 2){
col <-
col %>%
cut(breaks = quantiles,
labels = labels(length(quantiles)-1),
include.lowest = TRUE)
}
return(col)
}
df_cut_by_quantiles <-
function(df){
df_without_numeric <-
df[sapply(df, function(col) !is.numeric(col))]
df_numeric <-
df %>%
select_if(is.numeric)
df_numeric <-
df_numeric %>%
lapply(cut_by_quantiles)
return(cbind(df_without_numeric, df_numeric))
}